In [1]:
import os
import nltk
import math
import time
import gensim
import pickle
import random
import unicodedata
import numpy as np
from itertools import groupby
from operator import itemgetter
from nltk.corpus import wordnet as wn
from nltk.corpus.reader.api import CorpusReader
from nltk.corpus.reader.api import CategorizedCorpusReader
from nltk.cluster import KMeansClusterer, euclidean_distance
In [2]:
# Module Variables
ROOT = os.getcwd()
CORPUS = os.path.join(ROOT, "fixtures", "tagged")
In [3]:
PKL_PATTERN = r'(?!\.)[a-z_\s]+/[a-f0-9]+\.pickle'
CAT_PATTERN = r'([a-z_\s]+)/.*'
class BaleenCorpusReader(CategorizedCorpusReader, CorpusReader):
    """
    Quick reader for the preprocessed, tokenized, and tagged version of the corpus.
    """

    def __init__(self, root, fileids=PKL_PATTERN, categoryids=CAT_PATTERN):
        """
        Initialize the corpus reader. The ``categoryids`` pattern is passed to
        the ``CategorizedCorpusReader`` constructor as ``cat_pattern``; the
        remaining arguments are passed to the ``CorpusReader`` constructor.
        """
        CategorizedCorpusReader.__init__(self, {"cat_pattern": categoryids})
        CorpusReader.__init__(self, root, fileids)

    def _resolve(self, fileids, categories):
        """
        Returns a list of fileids or categories depending on what is passed
        to each internal corpus reader function. This primarily bubbles up to
        the high level ``docs`` method, but is implemented here in the same
        way as the nltk ``CategorizedPlaintextCorpusReader``.
        """
        if fileids is not None and categories is not None:
            raise ValueError("Specify fileids or categories, not both")
        if categories is not None:
            return self.fileids(categories)
        return fileids

    def docs(self, fileids=None, categories=None):
        """
        Returns the document loaded from a pickled object for every file in
        the corpus. This uses a generator to achieve memory-safe iteration.
        """
        # Resolve the fileids and the categories
        fileids = self._resolve(fileids, categories)
        # Create a generator, loading one document into memory at a time.
        for path, enc, fileid in self.abspaths(fileids, True, True):
            with open(path, 'rb') as f:
                yield pickle.load(f)

    def paras(self, fileids=None, categories=None):
        """
        Returns a generator of paragraphs where each paragraph is a list of
        sentences, which is in turn a list of (token, tag) tuples.
        """
        for doc in self.docs(fileids, categories):
            for paragraph in doc:
                yield paragraph

    def sents(self, fileids=None, categories=None):
        """
        Returns a generator of sentences where each sentence is a list of
        (token, tag) tuples.
        """
        for paragraph in self.paras(fileids, categories):
            for sentence in paragraph:
                yield sentence

    def words(self, fileids=None, categories=None):
        """
        Returns a generator of (token, tag) tuples.
        """
        for sentence in self.sents(fileids, categories):
            for token in sentence:
                yield token
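In [ ]:
# A quick illustration (not part of the original pipeline) of how categorization
# works: CAT_PATTERN captures the directory portion of a pickled fileid as its
# category. The fileid below is hypothetical.
import re
print(re.match(CAT_PATTERN, "books/0a1b2c3d4e.pickle").group(1))  # -> books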
In [4]:
corpus = BaleenCorpusReader(CORPUS)
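In [ ]:
# A small sanity check (a sketch, assuming the pickled fixtures exist under
# CORPUS): the reader streams documents lazily, so we can inspect the first
# tagged sentence without loading the whole corpus.
print(corpus.categories())
first_sent = next(corpus.sents())
print(first_sent[:10])  # the first ten (token, tag) pairs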
In [5]:
# Optional to run
# Print statistics about each category.
words = nltk.ConditionalFreqDist([
    (category, word)
    for category in corpus.categories()
    for word in corpus.words(categories=category)
])

for category, dist in words.items():
    wc = sum(dist.values())
    vb = len(dist)
    print("{} has {:,} vocab and {:,} words".format(category, vb, wc))
In [6]:
def labeled_documents(corpus, categories=None):
    """
    Returns a list of (document, label) tuples where a document is a
    list of (token, tag) pairs and the label is the document's category.
    """
    # Get the total list of categories
    categories = categories or corpus.categories()
    # Build a list of the documents with their associated words
    # Note this loads the entire corpus into memory!
    return [
        (
            list(corpus.words(fileids=fileid)),
            corpus.categories(fileids=fileid)[0]
        )
        for fileid in corpus.fileids(categories=categories)
    ]

def train_test_split(docs, categories=None, test=0.2):
    """
    Build training and testing sets of documents with their associated
    labels by shuffling the documents, then returning the last (1-test)%
    and the first test% of the shuffled data (i.e. the train and test
    sets, in that order).
    """
    # Shuffle the documents in place
    random.shuffle(docs)
    # Find the split index
    split = math.floor(len(docs) * test)
    # Return the train/test sets based on the split index
    return docs[split:], docs[:split]
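In [ ]:
# A minimal sketch of the split semantics on made-up toy data: with test=0.2,
# 20% of the shuffled documents land in the test set and the rest in training.
toy_docs = [([("hello", "UH"), ("world", "NN")], "books")] * 10
toy_train, toy_test = train_test_split(list(toy_docs), test=0.2)
print(len(toy_train), len(toy_test))  # -> 8 2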
In [6]:
# train, test = train_test_split(labeled_documents(corpus))
In [7]:
STOPWORDS = set(nltk.corpus.stopwords.words('english'))
lemmatizer = nltk.WordNetLemmatizer()
def is_punct(token):
    # Is every character punctuation?
    return all(unicodedata.category(char).startswith('P') for char in token)

def wnpos(tag):
    # Return the WordNet POS tag for a Penn Treebank tag
    return {
        'N': wn.NOUN,
        'V': wn.VERB,
        'R': wn.ADV,
        'J': wn.ADJ
    }.get(tag[0], wn.NOUN)

def normalize(document, stopwords=STOPWORDS):
    """
    Removes stopwords and punctuation, lowercases, and lemmatizes the
    (token, tag) pairs in a document.
    """
    for token, tag in document:
        token = token.lower().strip()
        if is_punct(token) or (token in stopwords):
            continue
        yield lemmatizer.lemmatize(token, wnpos(tag))

def extract_bow_features(documents):
    """
    Perform bag of words feature extraction for a list of (document, label) pairs.
    """
    for doc, label in documents:
        yield {
            "contains(\"{}\")".format(token): True
            for token in normalize(doc)
        }, label

def extract_tfidf_features(documents):
    """
    Perform TF-IDF feature extraction for a list of (document, label) pairs.
    """
    # Separate the labels from the documents
    labels = [label for _, label in documents]
    documents = [list(normalize(document)) for document, _ in documents]
    # Create the word index mapping
    lexicon = gensim.corpora.Dictionary(documents)
    # Vectorize each document and create the TF-IDF model
    documents = [lexicon.doc2bow(doc) for doc in documents]
    tfidf = gensim.models.TfidfModel(documents, normalize=True)
    # Note that you can save both the tfidf model and the lexicon to disk
    # in order to load them later to featurize new documents. E.g.
    # lexicon.save_as_text(os.path.join(ROOT, "fixtures", "baleen.lexicon"))
    # tfidf.save(os.path.join(ROOT, "fixtures", "baleen.tfidf_model"))
    for idx, vector in enumerate(documents):
        # Compute the TF-IDF scores for the document as a map
        dvec = dict(tfidf[vector])
        # Create the feature dictionary to use in an NLTK classifier
        yield ({
            "tfidf(\"{}\")".format(token): dvec.get(tid, 0.0)
            for tid, token in lexicon.items()
        }, labels[idx])
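In [ ]:
# A small end-to-end sketch of the normalizer and the bag-of-words extractor on
# a hand-tagged toy document (the tokens, tags, and 'books' label are made up):
toy = [([("The", "DT"), ("cats", "NNS"), ("were", "VBD"), ("running", "VBG"), (".", ".")], "books")]
print(list(normalize(toy[0][0])))       # stopwords and punctuation dropped, lemmas kept: ['cat', 'run']
print(next(extract_bow_features(toy)))  # ({'contains("cat")': True, 'contains("run")': True}, 'books')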
In [8]:
# Featurize the corpus and create train test sets.
start = time.time()
documents = extract_bow_features(labeled_documents(corpus, categories=['design', 'books']))
train, test = train_test_split(list(documents))
print("Feature extraction took {:0.3f} seconds".format(time.time() - start))
In [9]:
# Train the Naive Bayes Classifier
# The classifier expects a list of (feature, label) pairs
# where the features are a dictionary of text features.
start = time.time()
classifier = nltk.NaiveBayesClassifier.train(train)
print("Training Naive Bayes took {:0.3f} seconds".format(time.time()-start))
# Write the Naive Bayes Classifier to disk to use later
with open(os.path.join(ROOT, 'fixtures', 'nbayes-{}.pickle'.format(start)), 'wb') as f:
    pickle.dump(classifier, f)
# Show the accuracy of the classifier on the test set
accuracy = nltk.classify.accuracy(classifier, test)
print("Naive Bayes accuracy: {}".format(accuracy))
# Show the 30 most informative features
classifier.show_most_informative_features(30)
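In [ ]:
# A sketch of reusing the persisted classifier: reload the pickle written above
# (the filename still uses the `start` timestamp from the previous cell) and
# classify a single held-out feature dictionary.
with open(os.path.join(ROOT, 'fixtures', 'nbayes-{}.pickle'.format(start)), 'rb') as f:
    loaded = pickle.load(f)
print(loaded.classify(test[0][0]))  # test is a list of (features, label) pairs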
In [10]:
# Train the Maximum Entropy Classifier
start = time.time()
classifier = nltk.MaxentClassifier.train(train,
algorithm='megam', trace=2, gaussian_prior_sigma=1)
print("Training Maximum Entropy took {:0.3f} seconds".format(time.time()-start))
# Write the Maximum Entropy Classifier to disk to use later
with open(os.path.join(ROOT, 'fixtures', 'maxent-{}.pickle'.format(start)), 'wb') as f:
    pickle.dump(classifier, f)
# Show the accuracy of the classifier on the test set
accuracy = nltk.classify.accuracy(classifier, test)
print("Maximum Entropy accuracy: {}".format(accuracy))
# Show the 30 most informative features
classifier.show_most_informative_features(30)
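In [ ]:
# The 'megam' algorithm above depends on the external MEGAM binary (configured
# via nltk.config_megam()). A fallback sketch, assuming megam is unavailable:
# train with NLTK's built-in IIS algorithm instead; max_iter is an illustrative
# cutoff, not a tuned value.
iis = nltk.MaxentClassifier.train(train, algorithm='iis', trace=0, max_iter=10)
print("IIS accuracy: {}".format(nltk.classify.accuracy(iis, test)))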
In [11]:
class KMeansTopics(object):

    def __init__(self, corpus, k=10):
        """
        Expects a corpus reader object; there is no need to use
        labeled_documents for unsupervised clustering.
        """
        self.k = k
        self.vocab = list(set(normalize(corpus.words(categories=['design', 'books']))))
        self.model = None

    def vectorize(self, document):
        """
        Vectorizes a document (a list of part of speech tagged tokens) by
        normalizing it and one-hot encoding its terms against the shared
        vocabulary for K-Means clustering.
        """
        features = set(normalize(document))
        return np.array([token in features for token in self.vocab], np.short)

    def cluster(self, corpus):
        """
        Fits the K-Means model to the given data.
        """
        self.model = KMeansClusterer(self.k, euclidean_distance, avoid_empty_clusters=True)
        self.model.cluster([
            self.vectorize(corpus.words(fileid))
            for fileid in corpus.fileids(categories=['design', 'books'])
        ])

    def classify(self, document):
        """
        Pass through to the internal model's classify method.
        """
        return self.model.classify(self.vectorize(document))
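In [ ]:
# A sketch of the one-hot encoding performed by KMeansTopics.vectorize, using a
# toy vocabulary and feature set (both made up for illustration): each document
# becomes a 0/1 vector over the shared vocabulary.
toy_vocab = ["cat", "dog", "run"]
toy_features = {"cat", "run"}
print(np.array([term in toy_features for term in toy_vocab], np.short))  # -> [1 0 1]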
In [12]:
start = time.time()
clusterer = KMeansTopics(corpus)
clusterer.cluster(corpus)
print("Took {:0.3f} seconds to construct clusters".format(time.time() - start))
In [13]:
# Classify documents in corpus by cluster affinity
groups = [
    (clusterer.classify(corpus.words(fileid)), fileid)
    for fileid in corpus.fileids(categories=['design', 'books'])
]
# Group documents in corpus by cluster and display them
groups.sort(key=itemgetter(0))
for group, items in groupby(groups, key=itemgetter(0)):
    for item in items:
        print("{}: {}".format(*item))
In [47]:
# This is what Gensim was made for - short, sweet, simple:
start = time.time()
documents = [
    list(normalize(corpus.words(fileids=fileid)))
    for fileid in corpus.fileids()
]
# Create dictionary with tid to token mappings (or alternatively load one)
id2word = gensim.corpora.Dictionary(documents)
# id2word = gensim.corpora.Dictionary.load_from_text('corpus.txt')
# Vectorize each document to create a bag-of-words corpus and write it to disk.
path = os.path.join(ROOT, 'fixtures', 'corpus.mm')
bow_corpus = [id2word.doc2bow(doc) for doc in documents]
gensim.corpora.MmCorpus.serialize(path, bow_corpus)
# Reload the serialized corpus as a streamed matrix market corpus
mm = gensim.corpora.MmCorpus(path)
# Run the LDA model
lda = gensim.models.ldamodel.LdaModel(corpus=mm, id2word=id2word, num_topics=10, update_every=1, passes=20)
print("Took {:0.3f} seconds to fit LDA model.".format(time.time()-start))
# Print the topics by most informative words.
for topic in lda.print_topics(10):
    print(topic)
    print("")
In [48]:
lda.save(os.path.join(ROOT, 'fixtures', 'baleen.lda'))
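In [ ]:
# A sketch of reloading the persisted topic model in a later session (assuming
# the file written above is still in place), so topics can be inspected without refitting.
lda = gensim.models.ldamodel.LdaModel.load(os.path.join(ROOT, 'fixtures', 'baleen.lda'))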